{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "感觉书中的代码有问题，要不就是书里的文字表述有问题。  \n",
    "说是属性对的相关性，但实际的代码却是样本对（行）之间的相关性。  \n",
    "所以后面会先按照书中的描述，对行（样本）求相关系数，然后再对列（属性）求相关系数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 载入数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import sqrt\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签，字体名称为win中中文字体对应的英文名\n",
    "plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_url = (\"https://archive.ics.uci.edu/ml/machine-learning-\"\n",
    "\"databases/undocumented/connectionist-bench/sonar/sonar.all-data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>V0</th>\n",
       "      <th>V1</th>\n",
       "      <th>V2</th>\n",
       "      <th>V3</th>\n",
       "      <th>V4</th>\n",
       "      <th>V5</th>\n",
       "      <th>V6</th>\n",
       "      <th>V7</th>\n",
       "      <th>V8</th>\n",
       "      <th>V9</th>\n",
       "      <th>...</th>\n",
       "      <th>V51</th>\n",
       "      <th>V52</th>\n",
       "      <th>V53</th>\n",
       "      <th>V54</th>\n",
       "      <th>V55</th>\n",
       "      <th>V56</th>\n",
       "      <th>V57</th>\n",
       "      <th>V58</th>\n",
       "      <th>V59</th>\n",
       "      <th>V60</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0200</td>\n",
       "      <td>0.0371</td>\n",
       "      <td>0.0428</td>\n",
       "      <td>0.0207</td>\n",
       "      <td>0.0954</td>\n",
       "      <td>0.0986</td>\n",
       "      <td>0.1539</td>\n",
       "      <td>0.1601</td>\n",
       "      <td>0.3109</td>\n",
       "      <td>0.2111</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0027</td>\n",
       "      <td>0.0065</td>\n",
       "      <td>0.0159</td>\n",
       "      <td>0.0072</td>\n",
       "      <td>0.0167</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0084</td>\n",
       "      <td>0.0090</td>\n",
       "      <td>0.0032</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0453</td>\n",
       "      <td>0.0523</td>\n",
       "      <td>0.0843</td>\n",
       "      <td>0.0689</td>\n",
       "      <td>0.1183</td>\n",
       "      <td>0.2583</td>\n",
       "      <td>0.2156</td>\n",
       "      <td>0.3481</td>\n",
       "      <td>0.3337</td>\n",
       "      <td>0.2872</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0084</td>\n",
       "      <td>0.0089</td>\n",
       "      <td>0.0048</td>\n",
       "      <td>0.0094</td>\n",
       "      <td>0.0191</td>\n",
       "      <td>0.0140</td>\n",
       "      <td>0.0049</td>\n",
       "      <td>0.0052</td>\n",
       "      <td>0.0044</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0262</td>\n",
       "      <td>0.0582</td>\n",
       "      <td>0.1099</td>\n",
       "      <td>0.1083</td>\n",
       "      <td>0.0974</td>\n",
       "      <td>0.2280</td>\n",
       "      <td>0.2431</td>\n",
       "      <td>0.3771</td>\n",
       "      <td>0.5598</td>\n",
       "      <td>0.6194</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0232</td>\n",
       "      <td>0.0166</td>\n",
       "      <td>0.0095</td>\n",
       "      <td>0.0180</td>\n",
       "      <td>0.0244</td>\n",
       "      <td>0.0316</td>\n",
       "      <td>0.0164</td>\n",
       "      <td>0.0095</td>\n",
       "      <td>0.0078</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0100</td>\n",
       "      <td>0.0171</td>\n",
       "      <td>0.0623</td>\n",
       "      <td>0.0205</td>\n",
       "      <td>0.0205</td>\n",
       "      <td>0.0368</td>\n",
       "      <td>0.1098</td>\n",
       "      <td>0.1276</td>\n",
       "      <td>0.0598</td>\n",
       "      <td>0.1264</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0121</td>\n",
       "      <td>0.0036</td>\n",
       "      <td>0.0150</td>\n",
       "      <td>0.0085</td>\n",
       "      <td>0.0073</td>\n",
       "      <td>0.0050</td>\n",
       "      <td>0.0044</td>\n",
       "      <td>0.0040</td>\n",
       "      <td>0.0117</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0762</td>\n",
       "      <td>0.0666</td>\n",
       "      <td>0.0481</td>\n",
       "      <td>0.0394</td>\n",
       "      <td>0.0590</td>\n",
       "      <td>0.0649</td>\n",
       "      <td>0.1209</td>\n",
       "      <td>0.2467</td>\n",
       "      <td>0.3564</td>\n",
       "      <td>0.4459</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0031</td>\n",
       "      <td>0.0054</td>\n",
       "      <td>0.0105</td>\n",
       "      <td>0.0110</td>\n",
       "      <td>0.0015</td>\n",
       "      <td>0.0072</td>\n",
       "      <td>0.0048</td>\n",
       "      <td>0.0107</td>\n",
       "      <td>0.0094</td>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 61 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       V0      V1      V2      V3      V4      V5      V6      V7      V8  \\\n",
       "0  0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   \n",
       "1  0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   \n",
       "2  0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   \n",
       "3  0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   \n",
       "4  0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   \n",
       "\n",
       "       V9  ...     V51     V52     V53     V54     V55     V56     V57  \\\n",
       "0  0.2111  ...  0.0027  0.0065  0.0159  0.0072  0.0167  0.0180  0.0084   \n",
       "1  0.2872  ...  0.0084  0.0089  0.0048  0.0094  0.0191  0.0140  0.0049   \n",
       "2  0.6194  ...  0.0232  0.0166  0.0095  0.0180  0.0244  0.0316  0.0164   \n",
       "3  0.1264  ...  0.0121  0.0036  0.0150  0.0085  0.0073  0.0050  0.0044   \n",
       "4  0.4459  ...  0.0031  0.0054  0.0105  0.0110  0.0015  0.0072  0.0048   \n",
       "\n",
       "      V58     V59  V60  \n",
       "0  0.0090  0.0032    R  \n",
       "1  0.0052  0.0044    R  \n",
       "2  0.0095  0.0078    R  \n",
       "3  0.0040  0.0117    R  \n",
       "4  0.0107  0.0094    R  \n",
       "\n",
       "[5 rows x 61 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try:\n",
    "    # 第一个参数比较灵活，可以是url,也可是文件路径，或者IO等。\n",
    "    df_sonar = pd.read_csv(\"../../data/sonar.csv\", header=0)\n",
    "except Exception as e:\n",
    "    print(e)\n",
    "    df_sonar = pd.read_csv(target_url, header=None, prefix='V')\n",
    "    df_sonar.to_csv(\"../../data/sonar.csv\", index=False)\n",
    "\n",
    "df_sonar.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 行（样本对）之间的相关系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "nrows, ncols = df_sonar.shape  # ncols里包含了target列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第2行、第3行和第21行的样本值\n",
    "data_row2 = df_sonar.iloc[1, :-1]\n",
    "data_row3 = df_sonar.iloc[2, 0:60]\n",
    "data_row21 = df_sonar.iloc[20, :-1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算每一行的均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean2 = 0.0 \n",
    "mean3 = 0.0\n",
    "mean21 = 0.0\n",
    "length = len(data_row2)\n",
    "for i in range(length):\n",
    "    mean2 += data_row2[i]/length\n",
    "    mean3 += data_row3[i]/length\n",
    "    mean21 += data_row21[i]/length"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算每一行的方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "var2 = 0.0 \n",
    "var3 = 0.0\n",
    "var21 = 0.0\n",
    "length = len(data_row2)\n",
    "for i in range(length):\n",
    "    var2 += (data_row2[i] - mean2)**2 / length\n",
    "    var3 += (data_row3[i] - mean3)**2 / length\n",
    "    var21 += (data_row21[i] - mean21)**2 / length"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算样本对的相关性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "corr2_3 = 0.0\n",
    "corr2_21 = 0.0\n",
    "length = len(data_row2)\n",
    "for i in range(length):\n",
    "    corr2_3 += (data_row2[i] - mean2)*(data_row3[i] - mean3) / (sqrt(var2*var3) * length)\n",
    "    corr2_21 += (data_row2[i] - mean2)*(data_row21[i] - mean21) / (sqrt(var2*var21) * length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Correlation between Sample 2 and 3: \n",
      "0.7709381211911223\n",
      "\n",
      "Correlation between Sample 2 and 21: \n",
      "0.46654808078868865\n"
     ]
    }
   ],
   "source": [
    "print(\"Correlation between Sample 2 and 3: \")\n",
    "print(corr2_3)\n",
    "\n",
    "print()\n",
    "\n",
    "print(\"Correlation between Sample 2 and 21: \")\n",
    "print(corr2_21)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 列（属性对）之间的相关性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第2列、第3列和第21列的样本值\n",
    "data_col2 = df_sonar.iloc[:, 1]\n",
    "data_col3 = df_sonar.iloc[:, 2]\n",
    "data_col21 = df_sonar.iloc[:, 20]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算每一列的均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "mean2 = 0.0 \n",
    "mean3 = 0.0\n",
    "mean21 = 0.0\n",
    "length = len(data_col2)\n",
    "for i in range(length):\n",
    "    mean2 += data_col2[i]/length\n",
    "    mean3 += data_col3[i]/length\n",
    "    mean21 += data_col21[i]/length"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算每一列的方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "var2 = 0.0 \n",
    "var3 = 0.0\n",
    "var21 = 0.0\n",
    "length = len(data_col2)\n",
    "for i in range(length):\n",
    "    var2 += (data_col2[i] - mean2)**2 / length\n",
    "    var3 += (data_col3[i] - mean3)**2 / length\n",
    "    var21 += (data_col21[i] - mean21)**2 / length"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 计算属性对之间的相关性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "corr2_3 = 0.0\n",
    "corr2_21 = 0.0\n",
    "length = len(data_col2)\n",
    "for i in range(length):\n",
    "    corr2_3 += (data_col2[i] - mean2)*(data_col3[i] - mean3) / (sqrt(var2*var3) * length)\n",
    "    corr2_21 += (data_col2[i] - mean2)*(data_col21[i] - mean21) / (sqrt(var2*var21) * length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Correlation between Attribute 2 and 3: \n",
      "0.7799158719104267\n",
      "\n",
      "Correlation between Attribute 2 and 21: \n",
      "0.07525528679117012\n"
     ]
    }
   ],
   "source": [
    "print(\"Correlation between Attribute 2 and 3: \")\n",
    "print(corr2_3)\n",
    "\n",
    "print()\n",
    "\n",
    "print(\"Correlation between Attribute 2 and 21: \")\n",
    "print(corr2_21)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
